Description

Objects that are used in this notebook are generated with 01-Exploratory_analysis_and_differential_expression nb and loaded with RDS. Here we connect mRNA - mir and mRNA - DNAm data. We are exploring do miR and DNAm regulate the the expression of mRNA to do so we need to identify the targets of those kinds of regulations.
For miR targets we query the database of mir-gene inrerctions mirTarBase that have experimentally supported evidence. DNAm targets are determined as differentially methylated cpg islands in promoter regions of genes. For this analysiso only significant results with adjsuted p-values < 0.05 are considered.

Libraries

library(multiMiR)
library(gprofiler2)
library(knitr)
library(dplyr)

Loading data**

# mRNA
mRNA_dds <- readRDS("outputs/RDS/mRNA_dds.rds")
mRNA_dds_HPV <- readRDS("outputs/RDS/mRNA_dds_HPV.rds")
mRNA_res <- readRDS("outputs/RDS/mRNA_res.rds")
mRNA_res_HPV <- readRDS("outputs/RDS/mRNA_res_HPV.rds")

# mir
mir_dds <- readRDS("outputs/RDS/mir_dds.rds")
mir_dds_HPV <- readRDS("outputs/RDS/mir_dds_HPV.rds")
mir_res <- readRDS("outputs/RDS/mir_res.rds")
mir_res_HPV <- readRDS("outputs/RDS/mir_res_HPV.rds")

# DNAm
rnb_set <- readRDS("outputs/RDS/rnb_set.rds")
dnam_res_cgi <- readRDS("outputs/RDS/dnam_res_cgi.rds")

miR - mRNA

Interactions

Interaction tables are obtained by querying with multimir package. Setting up the query.

# Significant mir's
mir_query <- rownames(dplyr::filter(as.data.frame(mir_res), padj < 0.05))

# Significant mRNA
mRNA_query <- rownames(dplyr::filter(as.data.frame(mRNA_res), padj < 0.05))

print(paste("Length of mir query: ", length(mir_query),
             "   Length of mRNA query: ", length(mRNA_query)))
## [1] "Length of mir query:  548    Length of mRNA query:  2529"

Querying the database

database <- "mirtarbase"

# Querying
multi_db <- get_multimir(mirna = mir_query,
                         target = mRNA_query,
                         table = database)
## Searching mirtarbase ...
# Selecting results
targ_mir_mRNA <- 
  multiMiR::select(multi_db,
                   keytype = "type",
                   keys = "validated",
                   columns = c("mature_mirna_id", 
                               "target_ensembl",
                               "target_symbol"))

# Removing duplicates and removing type column (redundant)
targ_mir_mRNA <- unique(dplyr::select(targ_mir_mRNA, -type))

rm(multi_db)

head(targ_mir_mRNA)
##   mature_mirna_id target_symbol  target_ensembl
## 1  hsa-miR-26b-5p        ACADSB ENSG00000196177
## 2   hsa-miR-93-5p        ACADSB ENSG00000196177
## 3   hsa-miR-21-5p         ACAT1 ENSG00000075239
## 4    hsa-miR-1-3p         ACTA1 ENSG00000143632
## 5   hsa-miR-98-5p         ACTA1 ENSG00000143632
## 6   hsa-let-7d-5p         ACTA1 ENSG00000143632
print(c(paste("Number of mRNA mir interactions: ", dim(targ_mir_mRNA)[1]),
        paste("Number of mirs in interactions: ",
              length(unique(targ_mir_mRNA$mature_mirna_id))),
        paste("Number of target genes :", 
              length(unique(targ_mir_mRNA$target_ensembl)))))
## [1] "Number of mRNA mir interactions:  3484"
## [2] "Number of mirs in interactions:  102"  
## [3] "Number of target genes : 1272"
# Frequency table of mir's in the interaction table
table(targ_mir_mRNA$mature_mirna_id) %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  head()
##              Var1 Freq
## 1  hsa-miR-26b-5p  231
## 2    hsa-miR-1-3p  144
## 3   hsa-miR-93-5p  142
## 4 hsa-miR-106b-5p  127
## 5   hsa-miR-24-3p  118
## 6  hsa-miR-615-3p   93
# Frequency of genes in interaction table
head(table(targ_mir_mRNA$target_symbol) %>%
  as.data.frame() %>%
  arrange(desc(Freq)))
##     Var1 Freq
## 1    MYC   20
## 2   BCL2   19
## 3 CDKN1A   18
## 4   MDM4   16
## 5 TUBB2A   16
## 6   ZEB1   16

GSEA analysis on genes under mir regulation. Around half of DE genes are targets for mir regulation. Terms that appear in GSEA analysis are involved in regulation of cellular processes, regulation of transcription, there is a KEGG term proteoglyncans in cancer, and several REACTOME terms involved in cell cycle regulation.

# Using only part of sources
GSEA_SOURCES <- c("GO:BP","CORUM","KEGG","REAC","WP","MIRNA", "TF")

# Calling gost function
gost_res <- 
    gost(targ_mir_mRNA$target_ensembl,
       organism= "hsapiens",
       exclude_iea=TRUE,
       domain_scope="annotated",
       ordered_query=F,
       sources=GSEA_SOURCES)

# Change query name
gost_res$result$query <- "Mir targets on whole genome background"

gostplot(gost_res, capped = FALSE)

HPV Effect Querying the database for mirs and mRNA that are DE in the HPV group.

Building queries

# building mir query
mir_HPV_query <- filter(as.data.frame(mir_res_HPV), pvalue < 0.05)$row

# building mRNA query
mRNA_HPV_query <- filter(as.data.frame(mRNA_res_HPV), pvalue < 0.05)$row

print(paste("Length of mir query: ", length(mir_HPV_query),
             "   Length of mRNA query: ", length(mRNA_HPV_query)))
## [1] "Length of mir query:  20    Length of mRNA query:  44"

Querying all databases, predicted and validated since there are not many hits.

# querying all databases that contain predicted interactions
multi_db <- get_multimir(mirna = mir_HPV_query,
                         target = mRNA_HPV_query,
                         table = "predicted")
## Searching diana_microt ...
## Searching elmmo ...
## Searching microcosm ...
## Searching miranda ...
## Searching mirdb ...
## Searching pictar ...
## Searching pita ...
## Searching targetscan ...
# Selecting columns
targ_mir_mRNA_HPV <- 
  multiMiR::select(multi_db,
                   keytype = "type",
                   keys = "predicted",
                   columns = c("mature_mirna_id", 
                               "target_ensembl",
                               "target_symbol"))

# Selecting only unique interaction, dropping type
targ_mir_mRNA_HPV <-unique(dplyr::select(targ_mir_mRNA_HPV, -type))

targ_mir_mRNA_HPV
##    mature_mirna_id target_symbol  target_ensembl
## 1     hsa-miR-9-3p        TFAP2B ENSG00000008196
## 3     hsa-miR-9-5p       JAKMIP2 ENSG00000176049
## 4     hsa-miR-9-5p          NEFH ENSG00000100285
## 6   hsa-miR-512-3p        UGT1A1 ENSG00000241635
## 7     hsa-miR-9-5p      C11orf88 ENSG00000183644
## 8  hsa-miR-106a-5p          NEFH ENSG00000100285
## 10  hsa-miR-223-3p          NEFH ENSG00000100285
## 12  hsa-miR-512-3p          ZFR2 ENSG00000105278
## 14 hsa-miR-106a-5p          ZFR2 ENSG00000105278
## 16    hsa-miR-9-3p       JAKMIP2 ENSG00000176049
## 23    hsa-miR-9-5p         PTPRQ ENSG00000139304
## 27    hsa-miR-9-5p       TSPAN18 ENSG00000157570
## 29    hsa-miR-9-3p       ADCYAP1 ENSG00000141433
## 31    hsa-miR-9-3p       TSPAN18 ENSG00000157570
## 32    hsa-miR-9-3p         KCNS1 ENSG00000124134
## 35    hsa-miR-9-3p         UPK1B ENSG00000114638
## 36    hsa-miR-9-3p         ARNT2 ENSG00000172379
## 37    hsa-miR-9-3p         ATP9B ENSG00000166377
## 41    hsa-miR-9-5p          PAX8 ENSG00000125618
## 43  hsa-miR-223-3p       ADCYAP1 ENSG00000141433
## 45  hsa-miR-20b-5p          NEFH ENSG00000100285
## 49  hsa-miR-512-3p         UPK1B ENSG00000114638
## 50  hsa-miR-512-3p       HSD17B3 ENSG00000130948
## 52  hsa-miR-512-3p        CASP14 ENSG00000105141
## 53  hsa-miR-512-3p         ARNT2 ENSG00000172379
## 59  hsa-miR-512-3p          PAX8 ENSG00000125618
## 61  hsa-miR-512-3p         KCNS1 ENSG00000124134
## 63    hsa-miR-9-5p        CASP14 ENSG00000105141
## 65    hsa-miR-9-5p         TCP11 ENSG00000124678
## 66    hsa-miR-9-5p       HSD17B3 ENSG00000130948
## 70    hsa-miR-9-3p         PTPRQ ENSG00000139304
## 79    hsa-miR-9-5p        TFAP2B ENSG00000008196
print(c(paste("Number of mRNA mir interactions: ", dim(targ_mir_mRNA_HPV)[1]),
        paste("Number of mirs in interactions: ",
              length(unique(targ_mir_mRNA_HPV$mature_mirna_id))),
        paste("Number of target genes :", 
              length(unique(targ_mir_mRNA_HPV$target_ensembl)))))
## [1] "Number of mRNA mir interactions:  32"
## [2] "Number of mirs in interactions:  6"  
## [3] "Number of target genes : 17"
# Frequency table of mir's in the interaction table
table(targ_mir_mRNA_HPV$mature_mirna_id) %>%
  as.data.frame()%>%
  arrange(desc(Freq))
##              Var1 Freq
## 1    hsa-miR-9-5p   10
## 2    hsa-miR-9-3p    9
## 3  hsa-miR-512-3p    8
## 4 hsa-miR-106a-5p    2
## 5  hsa-miR-223-3p    2
## 6  hsa-miR-20b-5p    1
# Frequency of genes in interaction table
table(targ_mir_mRNA_HPV$target_symbol) %>%
  as.data.frame()%>%
  arrange(desc(Freq))
##        Var1 Freq
## 1      NEFH    4
## 2   ADCYAP1    2
## 3     ARNT2    2
## 4    CASP14    2
## 5   HSD17B3    2
## 6   JAKMIP2    2
## 7     KCNS1    2
## 8      PAX8    2
## 9     PTPRQ    2
## 10   TFAP2B    2
## 11  TSPAN18    2
## 12    UPK1B    2
## 13     ZFR2    2
## 14    ATP9B    1
## 15 C11orf88    1
## 16    TCP11    1
## 17   UGT1A1    1

GSEA on genes that are regulated in HPV groups.

# Using only part of sources
GSEA_SOURCES <- c("GO:BP","CORUM","KEGG","REAC","WP","MIRNA", "TF")

# Calling gost function
gost_res <- 
    gost(targ_mir_mRNA_HPV$target_ensembl,
       organism= "hsapiens",
       exclude_iea=TRUE,
       domain_scope="annotated",
       ordered_query=F,
       sources=GSEA_SOURCES)

# Change query name
gost_res$result$query <- "Mir HPV targets on whole genome background"

gostplot(gost_res, capped = FALSE)

Correlation

Testing count correlation for mir and mRNA for interaction pairs. Input are normalized count data for the 13 intersecting samples. This table will be used for picking the best mir to attach in the integrative table.

# Data frame with correlations
cor_mir <- 
  dplyr::select(targ_mir_mRNA,
                mir = mature_mirna_id,
                symbol = target_symbol,
                gene = target_ensembl)

# Looping through all the pairs.
for(i in 1:dim(cor_mir)[1]){
  if(i%%100 ==0){
    message(paste("Processing pair number: ",i,
                  " mir: ",cor_mir$mir[i],
                  " gene ",  cor_mir$symbol[i]))
  }
  c <- 
    cor.test(x=as.numeric(
      counts(mir_dds, normalize=TRUE)[cor_mir$mir[i],colnames(mRNA_dds)]),
      y=as.numeric(
        counts(mRNA_dds, normalize=TRUE)[cor_mir$gene[i],]
      ))
  cor_mir$r[i] <- c$estimate
  cor_mir$pvalue[i] <- c$p.value
}

# Saving cor_mir object into RDS because of long computing time
saveRDS(cor_mir, "outputs/RDS/cor_mir.rds")
cor_mir <- readRDS("outputs/RDS/cor_mir.rds")
head(dplyr::arrange(cor_mir,pvalue))
##               mir symbol            gene         r       pvalue
## 1 hsa-miR-133a-3p   MYPN ENSG00000138347 0.9997440 1.872519e-19
## 2 hsa-miR-133a-3p   TPM1 ENSG00000140416 0.9989469 4.464368e-16
## 3  hsa-miR-381-3p  MYOM2 ENSG00000036448 0.9959761 7.067478e-13
## 4  hsa-miR-381-3p  ASB11 ENSG00000165192 0.9938453 7.287883e-12
## 5 hsa-miR-376c-3p  RBM20 ENSG00000203867 0.9915593 4.122337e-11
## 6 hsa-miR-376c-3p   ADH4 ENSG00000198099 0.9909142 6.173350e-11

DNAm - mRNA

BUilding an DNA methylation and mRNA table. As input for DNAm a granges object with cpg island intervals and a column that contains gene names whose promoter region overlaps that cpg island.

mRNA_res_sub <- dplyr::filter(as.data.frame(mRNA_res), padj < 0.05)

targ_dnam_mRNA <- 
  plyranges::filter(dnam_res_cgi, ensembl %in% rownames(mRNA_res_sub))

Some of the cpg (19) islands overlap more than one gene promoter, and 72 genes have 2 (exactly) cpg islands in their promoters.

GSEA GSEA analysis on genes that are under methylation regulation. There many terms from TF source which is expected. Since the differentially methylated cpg islands that were selected are in promoter regions of those genes. Also in from the reactom database there are many terms involved in the regulation of the cell cycle.

# Using only part of sources
GSEA_SOURCES <- c("GO:BP","CORUM","KEGG","REAC","WP","MIRNA", "TF")

# Calling gost function
gost_res <- 
    gost(targ_dnam_mRNA$ensembl,
       organism= "hsapiens",
       exclude_iea=TRUE,
       domain_scope="annotated",
       ordered_query=F,
       sources=GSEA_SOURCES)

# Change query name
gost_res$result$query <- "DE genes with DM promoters"

gostplot(gost_res, capped = FALSE)

Session Info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=hr_HR.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=hr_HR.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=hr_HR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=hr_HR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_1.0.7      knitr_1.36       gprofiler2_0.2.1 multiMiR_1.14.0 
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7                matrixStats_0.61.0         
##  [3] bit64_4.0.5                 RColorBrewer_1.1-2         
##  [5] httr_1.4.2                  GenomeInfoDb_1.28.4        
##  [7] tools_4.1.2                 bslib_0.3.1                
##  [9] utf8_1.2.2                  R6_2.5.1                   
## [11] DBI_1.1.1                   lazyeval_0.2.2             
## [13] BiocGenerics_0.38.0         colorspace_2.0-2           
## [15] tidyselect_1.1.1            DESeq2_1.32.0              
## [17] bit_4.0.4                   compiler_4.1.2             
## [19] Biobase_2.52.0              DelayedArray_0.18.0        
## [21] plotly_4.10.0               rtracklayer_1.52.1         
## [23] labeling_0.4.2              sass_0.4.0                 
## [25] scales_1.1.1                genefilter_1.74.1          
## [27] stringr_1.4.0               digest_0.6.28              
## [29] Rsamtools_2.8.0             rmarkdown_2.11             
## [31] XVector_0.32.0              pkgconfig_2.0.3            
## [33] htmltools_0.5.2             MatrixGenerics_1.4.3       
## [35] fastmap_1.1.0               htmlwidgets_1.5.4          
## [37] rlang_0.4.12                RSQLite_2.2.8              
## [39] jquerylib_0.1.4             BiocIO_1.2.0               
## [41] generics_0.1.1              jsonlite_1.7.2             
## [43] crosstalk_1.1.1             BiocParallel_1.26.2        
## [45] RCurl_1.98-1.5              magrittr_2.0.1             
## [47] GenomeInfoDbData_1.2.6      Matrix_1.3-4               
## [49] Rcpp_1.0.7                  munsell_0.5.0              
## [51] S4Vectors_0.30.2            fansi_0.5.0                
## [53] lifecycle_1.0.1             stringi_1.7.5              
## [55] yaml_2.2.1                  SummarizedExperiment_1.22.0
## [57] zlibbioc_1.38.0             grid_4.1.2                 
## [59] blob_1.2.2                  parallel_4.1.2             
## [61] crayon_1.4.1                lattice_0.20-45            
## [63] Biostrings_2.60.2           splines_4.1.2              
## [65] annotate_1.70.0             KEGGREST_1.32.0            
## [67] locfit_1.5-9.4              pillar_1.6.4               
## [69] GenomicRanges_1.44.0        rjson_0.2.20               
## [71] geneplotter_1.70.0          stats4_4.1.2               
## [73] XML_3.99-0.8                glue_1.4.2                 
## [75] evaluate_0.14               data.table_1.14.2          
## [77] png_0.1-7                   vctrs_0.3.8                
## [79] gtable_0.3.0                purrr_0.3.4                
## [81] tidyr_1.1.4                 assertthat_0.2.1           
## [83] cachem_1.0.6                ggplot2_3.3.5              
## [85] xfun_0.27                   xtable_1.8-4               
## [87] restfulr_0.0.13             survival_3.2-13            
## [89] viridisLite_0.4.0           tibble_3.1.5               
## [91] GenomicAlignments_1.28.0    AnnotationDbi_1.54.1       
## [93] plyranges_1.12.1            memoise_2.0.0              
## [95] IRanges_2.26.0              ellipsis_0.3.2